其他
一文搞定相关性分析
一、先谈谈概念性的东西:
1、何为相关性:
2、何为相关性分析:
##### 构建一个数据集 #####
suppressMessages(library(ggplot2))
# Two perfectly linearly related integer columns: V2 is always V1 + 1
mydata <- data.frame(V1 = 1:10, V2 = 2:11)
head(mydata, n = 6L)
## V1 V2
## 1 1 2
## 2 2 3
## 3 3 4
## 4 4 5
## 5 5 6
## 6 6 7
#### Fit a simple linear regression with V1 as response and V2 as predictor
lm.model <- lm(formula = V1 ~ V2, data = mydata)
summary(object = lm.model)
## Warning in summary.lm(lm.model): essentially perfect fit: summary may be
## unreliable
##
## Call:
## lm(formula = V1 ~ V2, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.661e-16 -1.157e-16 4.273e-17 2.153e-16 4.167e-16
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.000e+00 2.815e-16 -3.553e+15 <2e-16 ***
## V2 1.000e+00 3.961e-17 2.525e+16 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.598e-16 on 8 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 6.374e+32 on 1 and 8 DF, p-value: < 2.2e-16
### Extract the slope (a) and the intercept (b) from the fitted model
# coef() returns the estimated coefficients, named "(Intercept)" and "V2".
# fitted.values must not be used here: it holds the per-row predictions,
# not the coefficients of the regression line.
model_coef <- coef(lm.model)
a <- unname(model_coef["V2"])
b <- unname(model_coef["(Intercept)"])
a
## [1] 1
b
## [1] -1
# The model is V1 ~ V2, so V2 goes on the x axis and V1 on the y axis; that
# way the equation in the title describes the line that is actually drawn.
# The coefficients are rounded only for display, because the fit is exact
# up to floating-point noise.
ggplot(data = mydata, aes(x = V2, y = V1)) +
  geom_point() +
  geom_line() +
  ggtitle(label = paste0("y = ", round(a, 3), " * x + ", round(b, 3)))
# As the plot shows, the linear correlation is quite strong
3、相关系数最常见的两种算法
(1)、Pearson相关系数
(2)、Spearman相关系数
二、R语言实现及可视化——ggcorrplot
########## ggcorrplot usage ############
# Reference: https://github.com/kassambara/ggcorrplot
# Install ggcorrplot only when it is missing, then attach it.
# library() is called unconditionally so the package is attached even right
# after a fresh install; require() alone would leave it unattached in that
# case and the later cor_pmat()/ggcorrplot() calls would fail.
if (!requireNamespace("ggcorrplot", quietly = TRUE)) {
  install.packages("ggcorrplot")
}
library(ggcorrplot)
# Load the example data set
data("mtcars")
class(mtcars)
## [1] "data.frame"
head(mtcars, n = 6L)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
# Pairwise correlations between all columns, rounded to one decimal place
corr <- round(cor(mtcars), digits = 1)
head(corr[, seq_len(6)])
## mpg cyl disp hp drat wt
## mpg 1.0 -0.9 -0.8 -0.8 0.7 -0.9
## cyl -0.9 1.0 0.9 0.8 -0.7 0.8
## disp -0.8 0.9 1.0 0.8 -0.7 0.9
## hp -0.8 0.8 0.8 1.0 -0.4 0.7
## drat 0.7 -0.7 -0.7 -0.4 1.0 -0.7
## wt -0.9 0.8 0.9 0.7 -0.7 1.0
# Matrix of p-values for the pairwise correlations between the columns
p.mat <- ggcorrplot::cor_pmat(mtcars)
head(p.mat[, seq_len(4)])
## mpg cyl disp hp
## mpg 0.000000e+00 6.112687e-10 9.380327e-10 1.787835e-07
## cyl 6.112687e-10 0.000000e+00 1.802838e-12 3.477861e-09
## disp 9.380327e-10 1.802838e-12 0.000000e+00 7.142679e-08
## hp 1.787835e-07 3.477861e-09 7.142679e-08 0.000000e+00
## drat 1.776240e-05 8.244636e-06 5.282022e-06 9.988772e-03
## wt 1.293959e-10 1.217567e-07 1.222320e-11 4.145827e-05
# Simplest correlation heat map
ggcorrplot(corr = corr)

# Draw circles instead of square tiles
ggcorrplot(corr = corr, method = "circle")

# Reorder the variables by clustering and set the tile outline colour
ggcorrplot(corr = corr, hc.order = TRUE, outline.color = "white")

# Show only one triangle of the matrix
ggcorrplot(
  corr = corr,
  hc.order = TRUE,        # whether to cluster the variables
  type = "lower",         # which triangle to display
  outline.color = "white"
)

# Choose the ggplot2 theme and the colour scale
ggcorrplot(
  corr = corr,
  hc.order = TRUE,
  type = "lower",
  outline.color = "white",
  ggtheme = ggplot2::theme_gray,
  colors = c("#6D9EC1", "white", "#E46726")
)

# Print the correlation coefficients on the tiles
ggcorrplot(
  corr = corr,
  hc.order = TRUE,
  type = "lower",
  lab = TRUE
)

# Supply the p-value matrix so that non-significant correlations are marked
ggcorrplot(
  corr = corr,
  hc.order = TRUE,
  type = "lower",
  p.mat = p.mat
)

# Leave the tiles of non-significant correlations blank instead
ggcorrplot(
  corr = corr,
  p.mat = p.mat,
  hc.order = TRUE,
  type = "lower",
  insig = "blank"
)
参考:
[1]: 高祖新, 言方荣. 医药统计分析与SPSS软件应用[M]. 北京: 人民卫生出版社, 2018: 169-187.
[2]: https://github.com/kassambara/ggcorrplot